# Import necessary libraries
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.interpolate import interp1d
import yaml
import os
import statsmodels.stats.api as sms
import scanpy as sc
import plotly.express as px
# Load the imaging data
figDir = "./figures"
if not os.path.exists(figDir):
os.makedirs(figDir)
tablesDir = "./tables"
if not os.path.exists(tablesDir):
os.makedirs(tablesDir)
outdir = "../data/output"
# Load colors dict
with open("../data/resources/iPSC_lines_map.yaml", 'r') as f:
iPSC_lines_map = yaml.load(f, Loader=yaml.FullLoader)["lines"]
colorsmap = dict(zip([i["newName"] for i in iPSC_lines_map.values()],[i["color"] for i in iPSC_lines_map.values()]))
dataImaging=pd.read_csv('../data/imaging_data/organoidMultiplexing_growthCurves_quant.csv')
# Filter and reformat the imaging data
dataImaging = dataImaging.loc[dataImaging["FileName"].str.contains("MIX"), ["Line","Day","LineRep","EquivalentDiameterArea(microns)"]]
dataImaging["replicate"] = dataImaging["LineRep"].str.split("_",expand=True)[1].astype(int)
dataImaging = dataImaging[["Line","Day","replicate","EquivalentDiameterArea(microns)"]].rename(columns={"Line":"Mix"})
# removing incomplete mix 7
dataImaging = dataImaging[dataImaging["Mix"] != "MIX7"]
dataImaging
| Mix | Day | replicate | EquivalentDiameterArea(microns) | |
|---|---|---|---|---|
| 2 | MIX8 | 0 | 3 | 463.653039 |
| 3 | MIX6 | 0 | 4 | 471.100434 |
| 5 | MIX8 | 0 | 4 | 480.015620 |
| 7 | MIX1 | 0 | 3 | 355.681447 |
| 11 | MIX5 | 0 | 1 | 459.539955 |
| ... | ... | ... | ... | ... |
| 585 | MIX2 | 10 | 1 | 1211.001565 |
| 588 | MIX1 | 10 | 1 | 844.396092 |
| 590 | MIX5 | 10 | 1 | 958.028792 |
| 591 | MIX4 | 10 | 3 | 898.788100 |
| 597 | MIX5 | 10 | 4 | 981.861506 |
209 rows × 4 columns
dataCensus=pd.read_csv('../data/census_seq/CensusSeq_data.csv')
# Filter and reformat the census data
dataCensus["Timepoint"] = dataCensus["Timepoint"].str.split(" ", expand= True)[1].astype(int)
dataCensus = dataCensus[["Sample name","DONOR","REPRESENTATION","MIX ID","Timepoint"]].rename(columns={"MIX ID":"Mix","Timepoint":"Day"})
dataCensus["Mix"] = "MIX"+dataCensus["Mix"].astype(str)
# removing incomplete mix 7
dataCensus = dataCensus[dataCensus["Mix"] != "MIX7"]
dataCensus["DONOR"] = dataCensus["DONOR"].replace({"CTL01A":"CTL01"})
dataCensus["DONOR"] = dataCensus["DONOR"].replace({"CHD2WT":"UCSFi001-A"})
dataCensus["DONOR"] = dataCensus["DONOR"].replace({"CHD8WT":"H9"})
dataCensus
| Sample name | DONOR | REPRESENTATION | Mix | Day | |
|---|---|---|---|---|---|
| 0 | M18 | CTL01 | 0.94993 | MIX4 | 5 |
| 1 | M18 | CTL02A | 0.00285 | MIX4 | 5 |
| 2 | M18 | CTL05A | 0.00750 | MIX4 | 5 |
| 3 | M18 | CTL07C | 0.01086 | MIX4 | 5 |
| 4 | M18 | H1 | 0.00647 | MIX4 | 5 |
| ... | ... | ... | ... | ... | ... |
| 577 | M60 | H9 | 0.30743 | MIX3 | 25 |
| 578 | M60 | CTL04E | 0.01336 | MIX3 | 25 |
| 579 | M60 | CTL06F | 0.24393 | MIX3 | 25 |
| 580 | M60 | CTL08A | 0.02211 | MIX3 | 25 |
| 581 | M60 | CTL09A | 0.40072 | MIX3 | 25 |
546 rows × 5 columns
# Define the density volume
densityVolume = 0.000767495
# Transform diameter to cells
dataImaging["EquivalentSphereVolume"] = (np.power((dataImaging["EquivalentDiameterArea(microns)"]/2), 3))*math.pi*(4/3)
dataImaging["nCells"] = dataImaging["EquivalentSphereVolume"]*densityVolume
#%%
# Compute DeltaGrowth
GrowthRate = pd.DataFrame()
# Loop over each unique mix in the census data
for m in dataCensus.Mix.unique().tolist():
GrowthRateMix = pd.DataFrame()
# Filter the data for the current mix
DataMix = dataCensus[dataCensus["Mix"] == m]
DataMix = DataMix[DataMix["Day"].isin([5,12,-2])]
DataMix["ClosestDay"] = DataMix["Day"].replace({5:4,12:10,-2:0})
TPTS = np.sort(DataMix.ClosestDay.unique())
# Loop over each unique timepoint for the current mix
for t in TPTS:
# Compute the average representation for each donor at the current timepoint
DataMixTPT = DataMix[DataMix["ClosestDay"] == t].groupby("DONOR")["REPRESENTATION"].mean().to_frame().T.reset_index(drop=True)
DataMixTPT.columns = ["AvgCensus."+i for i in DataMixTPT.columns.tolist()]
# Compute the mean and standard deviation of the number of cells for the current mix and timepoint
Mean = round(dataImaging.loc[(dataImaging["Day"] == t) &(dataImaging["Mix"] == m),"nCells"].mean(),2)
std = round(dataImaging.loc[(dataImaging["Day"] == t) &(dataImaging["Mix"] == m),"nCells"].std(),2)
# Create a dataframe with the computed values
NcellsDF = pd.DataFrame({"Mix":m,"Day":t,"MeanCells":Mean, "STDcells":std}, index=[0])
NcellsDF = pd.concat([NcellsDF,DataMixTPT], axis = 1)
# Append the dataframe to the growth rate dataframe for the current mix
GrowthRateMix = pd.concat([GrowthRateMix,NcellsDF], ignore_index=True)
# If there are at least two timepoints for the current mix, compute the growth rate
if GrowthRateMix.shape[0] >= 2:
# Compute the growth rate and add it to the growth rate dataframe
GrowthRateMix = GrowthRateMix.sort_values("Day")
GrowThaRateVals = GrowthRateMix.T.loc[DataMixTPT.columns]
GrowThaRateVals = GrowThaRateVals.T.div(GrowThaRateVals.T.shift(1)).T.dropna(axis = 1)
GrowThaRateVals.index = ["GR."+i.split(".")[1] for i in GrowThaRateVals.index]
GrowThaRateVals.columns = ["D{}_to_D{}_growthRate".format(GrowthRateMix.loc[i,"Day"],GrowthRateMix.loc[i+1,"Day"]) for i in GrowthRateMix.sort_values("Day").index.tolist()[:-1]]
GrowThaRateVals["Mix"] = m
GrowThaRateVals["DONOR"] = [i.split(".")[1] for i in GrowThaRateVals.index.tolist()]
# Add the mean and standard deviation of the number of cells and the average census to the growth rate dataframe
for d in GrowthRateMix.Day.unique():
GrowThaRateVals["MeanCells.D{}".format(d)] = GrowthRateMix.loc[GrowthRateMix["Day"] == d,"MeanCells"].values[0]
GrowThaRateVals["STDcells.D{}".format(d)] = GrowthRateMix.loc[GrowthRateMix["Day"] == d,"STDcells"].values[0]
AvgCensus = GrowthRateMix.loc[GrowthRateMix["Day"] == d, [i for i in GrowthRateMix.columns.tolist() if i.startswith("Avg")]].T
AvgCensus.index = [i.split(".")[1] for i in AvgCensus.index.tolist()]
GrowThaRateVals["AvgCensus.D{}".format(d)] = AvgCensus.loc[GrowThaRateVals.DONOR].to_numpy().flatten()
# Append the growth rate dataframe for the current mix to the overall growth rate dataframe
GrowthRate = pd.concat([GrowthRate,GrowThaRateVals], ignore_index=True)
GrowthRate
| D0_to_D4_growthRate | D4_to_D10_growthRate | Mix | DONOR | MeanCells.D0 | STDcells.D0 | AvgCensus.D0 | MeanCells.D4 | STDcells.D4 | AvgCensus.D4 | MeanCells.D10 | STDcells.D10 | AvgCensus.D10 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 4.172086 | 1.01204 | MIX4 | CTL01 | 31778.67 | 7785.07 | 0.22204 | 55127.37 | 4383.50 | 0.926370 | 312229.25 | 17148.21 | 0.937523 |
| 1 | 0.032907 | 1.397864 | MIX4 | CTL02A | 31778.67 | 7785.07 | 0.15174 | 55127.37 | 4383.50 | 0.004993 | 312229.25 | 17148.21 | 0.006980 |
| 2 | 0.07715 | 1.580955 | MIX4 | CTL05A | 31778.67 | 7785.07 | 0.13476 | 55127.37 | 4383.50 | 0.010397 | 312229.25 | 17148.21 | 0.016437 |
| 3 | 0.104393 | 0.996621 | MIX4 | CTL07C | 31778.67 | 7785.07 | 0.14174 | 55127.37 | 4383.50 | 0.014797 | 312229.25 | 17148.21 | 0.014747 |
| 4 | 0.038165 | 0.624666 | MIX4 | H1 | 31778.67 | 7785.07 | 0.16359 | 55127.37 | 4383.50 | 0.006243 | 312229.25 | 17148.21 | 0.003900 |
| 5 | 0.199842 | 0.548795 | MIX4 | KTD8_2 | 31778.67 | 7785.07 | 0.18613 | 55127.37 | 4383.50 | 0.037197 | 312229.25 | 17148.21 | 0.020413 |
| 6 | 3.768502 | 0.906179 | MIX5 | CTL01 | 48595.31 | 8011.44 | 0.20628 | 68553.09 | 7615.66 | 0.777367 | 377111.59 | 38194.70 | 0.704433 |
| 7 | 0.133898 | 0.521918 | MIX5 | CTL04E | 48595.31 | 8011.44 | 0.20558 | 68553.09 | 7615.66 | 0.027527 | 377111.59 | 38194.70 | 0.014367 |
| 8 | 0.137649 | 2.841229 | MIX5 | CTL05A | 48595.31 | 8011.44 | 0.13239 | 68553.09 | 7615.66 | 0.018223 | 377111.59 | 38194.70 | 0.051777 |
| 9 | 0.551949 | 2.552438 | MIX5 | CTL06F | 48595.31 | 8011.44 | 0.13279 | 68553.09 | 7615.66 | 0.073293 | 377111.59 | 38194.70 | 0.187077 |
| 10 | 0.039852 | 0.900892 | MIX5 | H1 | 48595.31 | 8011.44 | 0.16879 | 68553.09 | 7615.66 | 0.006727 | 377111.59 | 38194.70 | 0.006060 |
| 11 | 0.628373 | 0.374557 | MIX5 | UCSFi001-A | 48595.31 | 8011.44 | 0.15416 | 68553.09 | 7615.66 | 0.096870 | 377111.59 | 38194.70 | 0.036283 |
| 12 | 2.942335 | NaN | MIX8 | CTL01 | 41645.04 | 1760.62 | 0.30053 | 73605.42 | 3366.29 | 0.884260 | NaN | NaN | NaN |
| 13 | 0.021881 | NaN | MIX8 | CTL02A | 41645.04 | 1760.62 | 0.19469 | 73605.42 | 3366.29 | 0.004260 | NaN | NaN | NaN |
| 14 | 0.149044 | NaN | MIX8 | CTL04E | 41645.04 | 1760.62 | 0.23975 | 73605.42 | 3366.29 | 0.035733 | NaN | NaN | NaN |
| 15 | 0.285804 | NaN | MIX8 | CTL08A | 41645.04 | 1760.62 | 0.26503 | 73605.42 | 3366.29 | 0.075747 | NaN | NaN | NaN |
| 16 | 0.786518 | 0.757478 | MIX1 | CTL04E | 21740.41 | 9982.70 | 0.25248 | 40500.52 | 9223.00 | 0.198580 | 326966.32 | 56339.81 | 0.150420 |
| 17 | 0.248389 | 1.639404 | MIX1 | CTL05A | 21740.41 | 9982.70 | 0.16654 | 40500.52 | 9223.00 | 0.041367 | 326966.32 | 56339.81 | 0.067817 |
| 18 | 1.2327 | 2.985774 | MIX1 | CTL06F | 21740.41 | 9982.70 | 0.17678 | 40500.52 | 9223.00 | 0.217917 | 326966.32 | 56339.81 | 0.650650 |
| 19 | 0.040518 | 0.506822 | MIX1 | H1 | 21740.41 | 9982.70 | 0.20501 | 40500.52 | 9223.00 | 0.008307 | 326966.32 | 56339.81 | 0.004210 |
| 20 | 2.680004 | 0.237722 | MIX1 | UCSFi001-A | 21740.41 | 9982.70 | 0.19919 | 40500.52 | 9223.00 | 0.533830 | 326966.32 | 56339.81 | 0.126903 |
| 21 | 2.432402 | 0.382556 | MIX2 | CTL01 | 38089.55 | 3161.55 | 0.25723 | 96140.42 | 4002.24 | 0.625687 | 713617.89 | 196799.77 | 0.239360 |
| 22 | 0.064664 | 0.410425 | MIX2 | CTL02A | 38089.55 | 3161.55 | 0.17207 | 96140.42 | 4002.24 | 0.011127 | 713617.89 | 196799.77 | 0.004567 |
| 23 | 0.188459 | 0.289028 | MIX2 | CTL07C | 38089.55 | 3161.55 | 0.15556 | 96140.42 | 4002.24 | 0.029317 | 713617.89 | 196799.77 | 0.008473 |
| 24 | 0.362503 | 0.253184 | MIX2 | CTL08A | 38089.55 | 3161.55 | 0.25129 | 96140.42 | 4002.24 | 0.091093 | 713617.89 | 196799.77 | 0.023063 |
| 25 | 1.48166 | 2.984457 | MIX2 | CTL09A | 38089.55 | 3161.55 | 0.16385 | 96140.42 | 4002.24 | 0.242770 | 713617.89 | 196799.77 | 0.724537 |
| 26 | 0.113241 | 0.655055 | MIX3 | CTL04E | 50790.89 | 5151.37 | 0.17323 | 90932.18 | 2678.76 | 0.019617 | 526151.83 | 20702.51 | 0.012850 |
| 27 | 0.289329 | 3.071937 | MIX3 | CTL06F | 50790.89 | 5151.37 | 0.12620 | 90932.18 | 2678.76 | 0.036513 | 526151.83 | 20702.51 | 0.112167 |
| 28 | 0.343489 | 0.441506 | MIX3 | CTL08A | 50790.89 | 5151.37 | 0.20547 | 90932.18 | 2678.76 | 0.070577 | 526151.83 | 20702.51 | 0.031160 |
| 29 | 1.39796 | 1.84376 | MIX3 | CTL09A | 50790.89 | 5151.37 | 0.13462 | 90932.18 | 2678.76 | 0.188193 | 526151.83 | 20702.51 | 0.346983 |
| 30 | 3.2987 | 0.810998 | MIX3 | H9 | 50790.89 | 5151.37 | 0.17382 | 90932.18 | 2678.76 | 0.573380 | 526151.83 | 20702.51 | 0.465010 |
| 31 | 0.598539 | 0.284841 | MIX3 | UCSFi001-A | 50790.89 | 5151.37 | 0.18666 | 90932.18 | 2678.76 | 0.111723 | 526151.83 | 20702.51 | 0.031823 |
| 32 | 3.672488 | 0.534164 | MIX6 | CTL01 | 45944.60 | 6040.60 | 0.11796 | 83073.41 | 4017.45 | 0.433207 | 567830.49 | 50718.91 | 0.231403 |
| 33 | 0.001423 | 13.444444 | MIX6 | CTL02A | 45944.60 | 6040.60 | 0.08433 | 83073.41 | 4017.45 | 0.000120 | 567830.49 | 50718.91 | 0.001613 |
| 34 | 0.251951 | 0.535659 | MIX6 | CTL04E | 45944.60 | 6040.60 | 0.11149 | 83073.41 | 4017.45 | 0.028090 | 567830.49 | 50718.91 | 0.015047 |
| 35 | 0.232655 | 1.661672 | MIX6 | CTL05A | 45944.60 | 6040.60 | 0.08605 | 83073.41 | 4017.45 | 0.020020 | 567830.49 | 50718.91 | 0.033267 |
| 36 | 0.565313 | 2.113249 | MIX6 | CTL06F | 45944.60 | 6040.60 | 0.08518 | 83073.41 | 4017.45 | 0.048153 | 567830.49 | 50718.91 | 0.101760 |
| 37 | 0.305695 | 0.572213 | MIX6 | CTL07C | 45944.60 | 6040.60 | 0.08539 | 83073.41 | 4017.45 | 0.026103 | 567830.49 | 50718.91 | 0.014937 |
| 38 | 0.73663 | 0.370558 | MIX6 | CTL08A | 45944.60 | 6040.60 | 0.13064 | 83073.41 | 4017.45 | 0.096233 | 567830.49 | 50718.91 | 0.035660 |
| 39 | 2.242307 | 2.439426 | MIX6 | CTL09A | 45944.60 | 6040.60 | 0.09587 | 83073.41 | 4017.45 | 0.214970 | 567830.49 | 50718.91 | 0.524403 |
| 40 | 0.114095 | 0.711851 | MIX6 | H1 | 45944.60 | 6040.60 | 0.10798 | 83073.41 | 4017.45 | 0.012320 | 567830.49 | 50718.91 | 0.008770 |
| 41 | 1.270102 | 0.274423 | MIX6 | UCSFi001-A | 45944.60 | 6040.60 | 0.09510 | 83073.41 | 4017.45 | 0.120787 | 567830.49 | 50718.91 | 0.033147 |
fig, axes = plt.subplots(1,1, figsize=(15,5), dpi=300, sharex=False,sharey=False)
sns.boxplot(data=GrowthRate, x="DONOR", y="D0_to_D4_growthRate", whis=(0, 100),palette=colorsmap)
sns.stripplot(data=GrowthRate, x="DONOR", y="D0_to_D4_growthRate", size=10, color=".3")
axes.title.set_text("Growth rate stability from D0 to D4")
axes.set_xlabel("Cell line", fontsize=20)
axes.set_ylabel("D0 to D4 growth rate", fontsize=20)
sns.despine(offset=30)
plt.show()
fig.savefig(figDir+"/GrowthRateStability.0to4.svg", format='svg', bbox_inches='tight')
fig, axes = plt.subplots(1,1, figsize=(15,5), dpi=300, sharex=False,sharey=False)
sns.boxplot(data=GrowthRate, x="DONOR", y="D4_to_D10_growthRate", whis=(0, 100),palette=colorsmap)
sns.stripplot(data=GrowthRate, x="DONOR", y="D4_to_D10_growthRate", size=10, color=".3")
axes.title.set_text("Growth rate stability from D4 to D10")
axes.set_xlabel("Cell line", fontsize=20)
axes.set_ylabel("D4 to D10 growth rate", fontsize=20)
sns.despine(offset=30)
plt.show()
fig.savefig(figDir+"/GrowthRateStability.4to10.svg", format='svg', bbox_inches='tight')
# %%
sns.histplot(GrowthRate["D4_to_D10_growthRate"])
total=GrowthRate["D4_to_D10_growthRate"].tolist()
total.extend(GrowthRate["D0_to_D4_growthRate"].tolist())
sns.histplot(total)
<AxesSubplot: xlabel='D4_to_D10_growthRate', ylabel='Count'>
import matplotlib.pyplot as plt
# Assuming GrowthRate is a pandas DataFrame
D4_to_D10_growthRate = GrowthRate["D4_to_D10_growthRate"]
D0_to_D4_growthRate = GrowthRate["D0_to_D4_growthRate"]
plt.hist([D4_to_D10_growthRate, D0_to_D4_growthRate,total], label=['D4_to_D10_growthRate', 'D0_to_D4_growthRate', 'Total'], stacked=False,density=True)
plt.legend(loc='upper right')
plt.show()
sns.set_style("whitegrid")
sns.set_style("ticks")
fig,ax = plt.subplots(ncols=1,nrows=1, figsize=(7, 5),dpi=100 )
sns.despine(fig=ax.figure)
sns.histplot(total, stat='probability', ax = ax)
# Assume 'ax' is the axes object containing the histogram
ax = plt.gca()
# Get the rectangles representing the bars
patches = ax.patches
# Extract the heights (frequencies) and the left edges of the rectangles
frequencies = [patch.get_height() for patch in patches]
bin_edges = [patch.get_xy()[0] for patch in patches]
# Calculate the probabilities
probabilities = frequencies
# %%
mix_number=10
cell_dict={}
EB_cell_number=20000
total_cycles=10000
for i in range(1,total_cycles):
generated_data = np.random.choice(bin_edges, size=mix_number, p=probabilities)
cell_dict[i]=EB_cell_number*generated_data
simulated_exp=pd.DataFrame(cell_dict)
simulated_exp = simulated_exp.div(simulated_exp.sum(axis=0), axis=1)>0.05
sns.histplot(simulated_exp.sum(axis=0))
<AxesSubplot: ylabel='Count'>
min_x = 0
max_x = 6
sns.histplot(total, stat='probability')
# Assume 'ax' is the axes object containing the histogram
ax = plt.gca()
# Get the rectangles representing the bars
patches = ax.patches
# Extract the heights (frequencies) and the left edges of the rectangles
frequencies = [patch.get_height() for patch in patches]
bin_edges = [patch.get_xy()[0] for patch in patches]
# Calculate the probabilities
probabilities = frequencies
# Get density from Seaborn
x, y = sns.kdeplot(np.random.choice(bin_edges, size=10000, p=probabilities),clip=(min_x, max_x)).lines[0].get_data()
probabilities=y/sum(y)
mix_number=10
cell_dict={}
EB_cell_number=20000
total_cycles=10000
for i in range(1,total_cycles):
generated_data = np.random.choice(x, size=mix_number, p=probabilities)
cell_dict[i]=EB_cell_number/mix_number*generated_data
simulated_exp=pd.DataFrame(cell_dict)
simulated_exp = simulated_exp.div(simulated_exp.sum(axis=0), axis=1)>0.05
sns.set_style("whitegrid")
sns.set_style("ticks")
fig,ax = plt.subplots(ncols=1,nrows=1, figsize=(7, 5),dpi=100 )
sns.histplot(simulated_exp.sum(axis=0),binwidth=1, ax = ax)
<AxesSubplot: ylabel='Count'>
def empirical_dist(total, min_x, max_x):
plt.clf()
sns.histplot(total, stat='probability')
ax = plt.gca()
patches = ax.patches
frequencies = [patch.get_height() for patch in patches]
bin_edges = [patch.get_xy()[0] for patch in patches]
x, y = sns.kdeplot(np.random.choice(bin_edges, size=10000, p=frequencies),clip=(min_x, max_x)).lines[0].get_data()
probabilities=y/sum(y)
return x, probabilities
cell_dict={}
median_dict={}
min_x = 0
max_x = 9
mix_number=2
max_mix=20
EB_cell_number=20000
total_cycles=10000
nFinalCells = 100000
minCellsPerCelltype = 100
RarestCelltypeRate = 0.05
#what is the minimum number of cells per genotype given RarestCelltypeRate and minCellsPerCelltype required?
# RarestCelltypeRate(x100) : 100 = minCellsPerCelltype : X
MinCellsPerGenotype = (100*minCellsPerCelltype)/(RarestCelltypeRate*100)
MinCellsPerGenotype_Rate = MinCellsPerGenotype/nFinalCells
x, probabilities = empirical_dist(GrowthRate["D0_to_D4_growthRate"].tolist(), min_x, max_x)
x1, probabilities1 = empirical_dist(GrowthRate["D4_to_D10_growthRate"].tolist(), min_x, max_x)
k=0
while mix_number <= max_mix:
cell_dict[mix_number]={}
for i in range(1,total_cycles):
generated_data = np.random.choice(x, size=mix_number, p=probabilities)
generated_data_2 = np.random.choice(x1, size=mix_number, p=probabilities1)
if sum(generated_data)>mix_number:
k+=1
generated_cell=(EB_cell_number/mix_number)*generated_data
# if sum(generated_cell*generated_data_2)>sum(generated_cell):
# cell_dict[mix_number][k]=generated_cell*generated_data_2
cell_dict[mix_number][k]=generated_cell
simulated_exp=pd.DataFrame(cell_dict[mix_number])
plt.clf()
plot_data=(simulated_exp.div(simulated_exp.sum(axis=0), axis=1)>MinCellsPerGenotype_Rate).sum(axis=0)
testplot=sns.histplot(plot_data,discrete=True,stat='probability')
testplot.set(xlabel='Number of analysable genotypes', ylabel='Frequency')
testplot.set(xticks=range(0,15,1))
plt.title('Number of genotypes mixed: '+str(mix_number))
# plt.show()
plt.savefig(figDir+'/histogram_dual_'+str(mix_number)+'.png')
median_dict[mix_number]=[plot_data.mean()-plot_data.std(),plot_data.mean(),plot_data.mean()+plot_data.std()]
mix_number+=1
print(plot_data.std())
plt.clf()
data_samplings=pd.DataFrame(median_dict).T
data_samplings['x']=data_samplings.index
# PLot empirical distribution
sns.set_style("whitegrid")
sns.set_style("ticks")
fig,ax = plt.subplots(ncols=1,nrows=1, figsize=(7, 5),dpi=100 )
sns.set_style("white")
#plt.plot(data_samplings["x"], data_samplings[1], '-', linewidth = 3, label = "Median")
sns.lineplot(data=data_samplings,x="x",y=1,color="#FDA63A",
linewidth=3, ax = ax)
sns.lineplot(data=data_samplings,x="x",y=0,color="blue",
linewidth=1.5, ax = ax)
sns.lineplot(data=data_samplings,x="x",y=2,color="blue",
linewidth=1.5, ax = ax)
plt.fill_between(data_samplings["x"], data_samplings[0], data_samplings[2] ,
alpha=0.2, color='blue',linewidth=0.0)
#plt.xlabel('pseudotime', size=30)
ax.set_ylim([0,
round(data_samplings[2].max()+1) ])
ax.set_xlim([0, data_samplings["x"].max()])
ax.set_xticks([0, 2,5,10,20])
ax.set_xlabel('Mixed cell lines') # Y label
ax.set_ylabel('Recovered cell lines') # Y label
sns.despine(fig=ax.figure)
fig.savefig(figDir+"/MinGenotypeRate0.05.Simulation.svg", format='svg', bbox_inches='tight')
0.35022365621497326 0.5113858785630576 0.6360796124255776 0.7587491990292478 0.861063352678361 0.9588591510514366 1.0355798223004864 1.112408498986447 1.1675997689414286 1.2417317661082883 1.2796660181449084 1.3421836974746213 1.379552503611448 1.4277258582445909 1.4696310161549155 1.5129859381928021 1.5437397379132072 1.5503650190953209 1.6061752540136054
<Figure size 432x288 with 0 Axes>
adataComp = sc.read(outdir+"/adatas/adataPaga.h5ad")
leidenOrder=["ProliferatingProgenitors","RadialGliaProgenitors","OuterRadialGliaAstrocytes","CajalR_like","Neurons","MigratingNeurons","GlutamatergicNeurons_early","GlutamatergicNeurons_late","Interneurons_GAD2","Interneurons"]
for paradigm in adataComp.obs.type.unique():
adataCompStage = adataComp[adataComp.obs["type"] == paradigm]
compositions = pd.DataFrame(adataCompStage.obs.groupby(["leidenAnnotated","stage"]).size())
compositions = compositions.reset_index().rename(columns={0:"number_of_cells"})
compositions["cells_fraction"] = compositions["number_of_cells"] / np.array(compositions.groupby("stage")["number_of_cells"].sum()[compositions["stage"].tolist()])
compositions["cells_fraction"] = round(compositions["cells_fraction"],2)
fig = px.bar(compositions, x="stage", y="cells_fraction", color="leidenAnnotated", hover_data=compositions, text="cells_fraction",
category_orders={"stage":['early', 'mid', 'late'],
"leidenAnnotated":leidenOrder}, height=800,width=1000, template="plotly_white",title=paradigm,
color_discrete_map = dict(zip(adataCompStage.obs["leidenAnnotated"].cat.categories,adataCompStage.uns["leidenAnnotated_colors"]))
)
fig.update_traces(textangle=0, marker_line_color='rgb(8,48,107)', textposition = 'inside',
marker_line_width=1.5, opacity=1)
fig.update_layout(font=dict(size=25, color='black'),
yaxis = dict(tickfont = dict(size=30)),
xaxis = dict(tickfont = dict(size=30)))
fig.show()
cell_dict={}
median_dict={}
min_x = 0
max_x = 9
mix_number=2
max_mix=20
EB_cell_number=20000
total_cycles=100000
nFinalCells = 100000
minCellsPerCelltype = 100
RarestCelltypeRate = 0.09
#what is the minimum number of cells per genotype given RarestCelltypeRate and minCellsPerCelltype required?
# RarestCelltypeRate(x100) : 100 = minCellsPerCelltype : X
MinCellsPerGenotype = (100*minCellsPerCelltype)/(RarestCelltypeRate*100)
MinCellsPerGenotype_Rate = MinCellsPerGenotype/nFinalCells
print(MinCellsPerGenotype_Rate)
x, probabilities = empirical_dist(GrowthRate["D0_to_D4_growthRate"].tolist(), min_x, max_x)
x1, probabilities1 = empirical_dist(GrowthRate["D4_to_D10_growthRate"].tolist(), min_x, max_x)
k=0
while mix_number <= max_mix:
cell_dict[mix_number]={}
for i in range(1,total_cycles):
generated_data = np.random.choice(x, size=mix_number, p=probabilities)
generated_data_2 = np.random.choice(x1, size=mix_number, p=probabilities1)
if sum(generated_data)>mix_number:
k+=1
generated_cell=(EB_cell_number/mix_number)*generated_data
# if sum(generated_cell*generated_data_2)>sum(generated_cell):
# cell_dict[mix_number][k]=generated_cell*generated_data_2
cell_dict[mix_number][k]=generated_cell
simulated_exp=pd.DataFrame(cell_dict[mix_number])
plt.clf()
plot_data=(simulated_exp.div(simulated_exp.sum(axis=0), axis=1)>MinCellsPerGenotype_Rate).sum(axis=0)
testplot=sns.histplot(plot_data,discrete=True,stat='probability')
testplot.set(xlabel='Number of analysable genotypes', ylabel='Frequency')
testplot.set(xticks=range(0,15,1))
plt.title('Number of genotypes mixed: '+str(mix_number))
# plt.show()
median_dict[mix_number]=[plot_data.mean()-plot_data.std(),plot_data.mean(),
plot_data.mean()+plot_data.std(),
sms.DescrStatsW(plot_data).tconfint_mean(alpha=0.05)[0],
sms.DescrStatsW(plot_data).tconfint_mean(alpha=0.05)[1] ]
#median_dict[mix_number]=[plot_data.mean()-plot_data.std(),plot_data.mean(),plot_data.mean()+plot_data.std()]
mix_number+=1
print(plot_data.std())
plt.clf()
data_samplings=pd.DataFrame(median_dict).T
data_samplings['x']=data_samplings.index
data_samplings.columns = ["-1std","mean","+1std","95CI_Low","95CI_Up","x"]
data_samplings["-1std"] = data_samplings["-1std"].astype(float)
data_samplings["+1std"] = data_samplings["+1std"].astype(float)
data_samplings["mean"] = data_samplings["mean"].astype(float)
#%%
# PLot empirical distribution
sns.set_style("whitegrid")
sns.set_style("ticks")
fig,ax = plt.subplots(ncols=1,nrows=1, figsize=(7, 5),dpi=100 )
sns.set_style("white")
#plt.plot(data_samplings["x"], data_samplings[1], '-', linewidth = 3, label = "Median")
sns.lineplot(data=data_samplings,x="x",y="mean",color="#FDA63A",
linewidth=1, ax = ax,zorder=200)
sns.lineplot(data=data_samplings,x="x",y="-1std",color="blue",
linewidth=1.5, ax = ax)
sns.lineplot(data=data_samplings,x="x",y="+1std",color="blue",
linewidth=1.5, ax = ax)
plt.fill_between(data_samplings["x"], data_samplings["-1std"], data_samplings["+1std"] ,
alpha=0.2, color='blue',linewidth=0.0)
ax.set_ylim([0,
round(data_samplings["+1std"].max()+1) ])
ax.set_xlim([0, data_samplings["x"].max()])
ax.set_xticks([0, 2,5,10,20])
ax.set_xlabel('Mixed cell lines') # Y label
ax.set_ylabel('Recovered cell lines') # Y label
sns.despine(fig=ax.figure)
fig.savefig(figDir+"/MinGenotypeRate.Simulation.RarestCT:{}.svg".format(MinCellsPerGenotype_Rate), format='svg', bbox_inches='tight')
0.011111111111111112 0.2849344287427844 0.4208638181481634 0.5387833043648257 0.6584005055262023 0.7726536343098065 0.8731419499739559 0.966850234873297 1.051021984167527 1.12639100040836 1.1969438566333812 1.2664553836042336 1.3280871262657166 1.3813775188204362 1.4392394349617124 1.4920005681881165 1.5348624143887402 1.584403817770791 1.6253521422928991 1.666051530216503
<Figure size 360x504 with 0 Axes>
SamplingsTable = data_median[["mean","+1std","95CI_Low","95CI_Up","x"]].rename(columns={"x":"MixedLines","mean":"meanRecoveredLines"})
SamplingsTable["sd"] = SamplingsTable["+1std"] - SamplingsTable["meanRecoveredLines"]
SamplingsTable = SamplingsTable.drop(columns=["+1std"])
SamplingsTable.to_excel(tablesDir+"/MinGenotypeRate.Simulation.RarestCT:{}.xlsx".format(round(MinCellsPerGenotype_Rate,2)), index=False)
from scipy.optimize import curve_fit
# let's fit a power function to the data
def powerFun(x, a, b, c):
y = a * np.power(x, b) + c
return y
continous_cuntSpace = np.linspace(min(data_samplings['x']), max(data_samplings['x']), 100)
FittedValues = pd.DataFrame(index = continous_cuntSpace)
fittedParams = curve_fit(powerFun, data_samplings.index.tolist(),data_samplings[1])[0]
FittedValues["Fitted_log_RarestCelltypeRate:{}".format(RarestCelltypeRate)] = powerFun(continous_cuntSpace, *fittedParams)
#sns.lineplot(continous_cuntSpace, powerFun(continous_cuntSpace, *fittedParams), label="powerFun")
fittedY = powerFun(np.array([2,5,10,20]), *fittedParams)
print(fittedY)
from matplotlib import pylab
from matplotlib.pyplot import figure
sns.set_style("whitegrid")
pylab.rcParams['figure.figsize'] = (5, 7)
FittedValues["MixedGenotypes"] = FittedValues.index.tolist()
plot = FittedValues.melt("MixedGenotypes", value_name="Recovered Genotypes").copy()
plot = plot[plot["variable"] == "Fitted_log_RarestCelltypeRate:{}".format(RarestCelltypeRate)]
sns.set_style("ticks")
fig,ax = plt.subplots(ncols=1,nrows=1, figsize=(7, 5),dpi=100 )
sns.lineplot(data=plot,x="MixedGenotypes",y="Recovered Genotypes",hue="variable",palette="Set2",
linewidth=3, ax = ax)
#ax.get_legend().remove()# To remove the legend
# ax.set_ylabel('SHH expressing cells', fontsize = 30) # Y label
# ax.set_xlabel('genotype', fontsize = 30) # X label
ax.set_ylim([0,
round(data_samplings[2].max()+1) ])
ax.set_xlim([0, (max_mix+1)])
#ax.set_yticks([0,.25,.5,.75,1])
fig.suptitle('')
ax.set_xlabel('Mixed cell lines') # Y label
ax.set_ylabel('Recovered cell lines') # Y label
ax.set_xticks([0, 2,5,10,20])
### Pick the vertical lines to plot
vlinelist = [5,10,20]
#fittedParams = curve_fit(powerFun, finalDF.index.tolist(),finalDF["Fitted_log_RarestCelltypeRate:{}".format(RarestCelltypeRate)])[0]
predRecover = powerFun(np.array(vlinelist), *fittedParams)
vlineDict = dict(zip(vlinelist,predRecover))
vlineDictDF = pd.DataFrame(vlineDict, index=["Recovered"]).T
vlineDictDF["mixedLines"] = vlineDictDF.index
for vl in list(vlineDict.keys()):
plt.vlines(x=vl,ymax=vlineDict[vl],ymin=0, color='#e6e6e6',linestyle='--')
plt.hlines(y=vlineDict[vl],xmax=vl,xmin=0, color='magenta',linestyle='-',linewidth = .1 )
ax.set_title('Number of recovered cell lines with more than {}% representation'.format(round(MinCellsPerGenotype_Rate*100)))
sns.scatterplot(data = vlineDictDF, x = "mixedLines", y = "Recovered", color = "magenta",
s = 50,ax = ax, zorder=100)
ax.get_legend().remove()
sns.despine(fig=ax.figure)
fig.savefig(figDir+"/MinGenotypeRate.Fitted.RarestCT:{}.svg".format(RarestCelltypeRate), format='svg', bbox_inches='tight')
[ 1.83116391 4.48099458 7.77981212 12.8719096 ]